from typing import (
  Any,
  TypeVar,
  Protocol,
  AsyncGenerator,
  List,
  Optional,
  Iterable,
  Dict,
  Union,
  Tuple,
  Generic,
  Type,
  Literal,
)
import bentoml
from bentoml._internal.runner.runner_handle import RunnerHandle

from openllm_core import LLMConfig
from openllm_core._typing_compat import M, T, LiteralBackend

from ._llm import LLM

# Model / tokenizer type variables local to this stub; they mirror the
# M, T pair imported from openllm_core._typing_compat but are redeclared
# here so the Protocol classes below can be generic on their own params.
Mo = TypeVar('Mo')
To = TypeVar('To')

__all_ = ['Runner', 'runner']

def runner(llm: LLM[M, T]) -> Runner[M, T]: ...

# class Runner(Protocol[Mo, To]):
#   __doc__: str = ...
#   __module__: str = ...
#   llm: LLM[Mo, To] = ...
#   llm_config: LLMConfig = ...
#   llm_type: str = ...
#   llm_tag: bentoml.Tag = ...
#   llm_bentomodel: bentoml.Model = ...
#   identifying_params: Dict[str, Any] = ...
#   backend: LiteralBackend = ...
#   template: str = ...
#   system_message: str = ...
#
#   @api  # type: ignore[arg-type] # XXX: I don't really know how to fix this for marking positional-only arg as self?
#   async def generate_iterator(
#     self,
#     prompt_token_ids: List[int],
#     request_id: str,
#     stop: Optional[Iterable[str]] = ...,
#     adapter_name: Optional[str] = ...,
#     **attrs: Any,
#   ) -> AsyncGenerator[GenerationOutput, None]: ...

class _Runnable(Protocol[Mo, To]):
  """Structural type of the runnable class handed to ``Runner.__init__``.

  Mirrors the bentoml Runnable contract: class-level scheduling metadata
  plus the async generation entry point.
  """

  # Resource kinds this runnable can be scheduled on (bentoml convention).
  SUPPORTED_RESOURCES: Tuple[Literal['nvidia.com/gpu', 'amd.com/gpu', 'cpu'], ...] = ...
  # Whether bentoml may run multiple CPU threads against one instance.
  SUPPORTS_CPU_MULTI_THREADING: bool = ...
  llm: LLM[Mo, To] = ...          # the wrapped model object
  config: LLMConfig = ...         # generation/runtime configuration for the LLM
  def __init__(self, llm: LLM[Mo, To]) -> None: ...
  # Streams generated text chunks for one request.
  # NOTE(review): yields str here, while Runner.generate_iterator is typed
  # over List[int] inputs — presumably the runner method adapts between the
  # two; confirm against the runtime implementation.
  async def generate_iterator(
    self,
    prompt: str,
    request_id: str,
    prompt_token_ids: Optional[List[int]] = ...,
    stop: Optional[Union[str, Iterable[str]]] = ...,
    adapter_name: Optional[str] = ...,
    **attrs: Any,
  ) -> AsyncGenerator[str, None]: ...

# Input and return type variables parameterizing RunnerMethod below.
In = TypeVar('In')
Ret = TypeVar('Ret')

class RunnerMethod(Generic[In, Ret]): ...

class Runner(Protocol[Mo, To]):
  """Structural type of the object returned by :func:`runner`.

  Describes a bentoml-style runner specialized for an :class:`LLM`:
  identifying metadata, the streaming ``generate_iterator`` method, and the
  lifecycle/scheduling surface of a bentoml Runner (init, handle management,
  worker scheduling).
  """

  __doc__: str = ...
  __module__: str = ...
  # Identifying metadata for the wrapped model.
  llm_type: str = ...
  llm_tag: bentoml.Tag = ...
  identifying_params: Dict[str, Any] = ...
  llm: LLM[Mo, To] = ...
  config: LLMConfig = ...
  # Inference backend selector (e.g. which engine serves the model —
  # exact values come from openllm_core's LiteralBackend).
  backend: LiteralBackend = ...
  # Prompt template and system message applied to requests — TODO confirm
  # exact semantics against the runtime implementation.
  template: str = ...
  system_message: str = ...

  # Runner method descriptor: exposed as an attribute on the runner and
  # invoked via its ``async_stream`` entry point.
  # NOTE(review): typed as RunnerMethod[List[int], ...] although
  # ``async_stream`` takes a ``prompt: str`` first argument — verify which
  # is authoritative against the runtime module.
  class generate_iterator(RunnerMethod[List[int], AsyncGenerator[str, None]]):
    @staticmethod
    def async_stream(
      prompt: str,
      request_id: str,
      prompt_token_ids: Optional[List[int]] = ...,
      stop: Optional[Union[Iterable[str], str]] = ...,
      adapter_name: Optional[str] = ...,
      **attrs: Any,
    ) -> AsyncGenerator[str, None]: ...

  # Constructor mirrors bentoml.Runner's keyword surface.
  def __init__(
    self,
    runnable_class: Type[_Runnable[Mo, To]],
    *,
    runnable_init_params: Optional[Dict[str, Any]] = ...,
    name: Optional[str] = ...,
    scheduling_strategy: Type[bentoml.Strategy] = ...,
    models: Optional[List[bentoml.Model]] = ...,
    max_batch_size: Optional[int] = ...,
    max_latency_ms: Optional[int] = ...,
    method_configs: Optional[Dict[str, Dict[str, int]]] = ...,
    embedded: bool = ...,
  ) -> None: ...

  # Attributes inherited from the bentoml Runner surface.
  name: str = ...
  models: List[bentoml.Model] = ...
  resource_config: Dict[str, Any]
  runnable_class: Type[_Runnable[Mo, To]]
  embedded: bool
  runner_methods: List[RunnerMethod[Any, Any]]
  scheduling_strategy: Type[bentoml.Strategy]
  workers_per_resource: Union[int, float] = ...
  runnable_init_params: Dict[str, Any] = ...
  _runner_handle: RunnerHandle = ...

  # Lifecycle: run in-process (init_local) or connect to a remote/worker
  # handle (init_client); destroy tears the handle down.
  def init_local(self, quiet: bool = False) -> None: ...
  def init_client(self, handle_class: Optional[Type[RunnerHandle]] = ..., *args: Any, **kwargs: Any) -> None: ...
  async def runner_handle_is_ready(self, timeout: int = ...) -> bool: ...
  def destroy(self) -> None: ...
  @property
  def scheduled_worker_count(self) -> int: ...
  @property
  def scheduled_worker_env_map(self) -> Dict[int, Dict[str, Any]]: ...
